

<!DOCTYPE html>


<html lang="en" data-content_root="" >

  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />

    <title>ONNX Export &#8212; Brevitas Documentation - v0.10.0</title>
  
  
  
  <script data-cfasync="false">
    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
  </script>
  
  <!-- Loaded before other Sphinx assets -->
  <link href="../_static/styles/theme.css?digest=3ee479438cf8b5e0d341" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=3ee479438cf8b5e0d341" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=3ee479438cf8b5e0d341" rel="stylesheet" />

  
  <link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=3ee479438cf8b5e0d341" rel="stylesheet" />
  <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />

    <link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
    <link rel="stylesheet" type="text/css" href="../_static/sg_gallery.css" />
    <link rel="stylesheet" type="text/css" href="../_static/nbsphinx-code-cells.css" />
  
  <!-- Pre-loaded scripts that we'll load fully later -->
  <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=3ee479438cf8b5e0d341" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=3ee479438cf8b5e0d341" />
  <script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=3ee479438cf8b5e0d341"></script>

    <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
    <script src="../_static/jquery.js"></script>
    <script src="../_static/underscore.js"></script>
    <script src="../_static/_sphinx_javascript_frameworks_compat.js"></script>
    <script src="../_static/doctools.js"></script>
    <script src="../_static/sphinx_highlight.js"></script>
    <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
    <script>window.MathJax = {"tex": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true}, "options": {"ignoreHtmlClass": "tex2jax_ignore|mathjax_ignore|document", "processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script>
    <script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
    <script>DOCUMENTATION_OPTIONS.pagename = 'tutorials/onnx_export';</script>
    <script>
        DOCUMENTATION_OPTIONS.theme_version = '0.15.3';
        DOCUMENTATION_OPTIONS.theme_switcher_json_url = 'https://xilinx.github.io/brevitas/dev/_static/versions.json';
        DOCUMENTATION_OPTIONS.theme_switcher_version_match = 'v0.10.0';
        DOCUMENTATION_OPTIONS.show_version_warning_banner = false;
        </script>
    <link rel="author" title="About these documents" href="../about.html" />
    <link rel="index" title="Index" href="../genindex.html" />
    <link rel="search" title="Search" href="../search.html" />
    <link rel="next" title="Settings" href="../settings.html" />
    <link rel="prev" title="Quantized RNNs and LSTMs" href="quant_recurrent.html" />
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <meta name="docsearch:language" content="en"/>
  </head>
  
  
  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">

  
  
  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
  
  <div id="pst-scroll-pixel-helper"></div>
  
  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
    <i class="fa-solid fa-arrow-up"></i>Back to top</button>

  
  <input type="checkbox"
          class="sidebar-toggle"
          id="pst-primary-sidebar-checkbox"/>
  <label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
  
  <input type="checkbox"
          class="sidebar-toggle"
          id="pst-secondary-sidebar-checkbox"/>
  <label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
  
  <div class="search-button__wrapper">
    <div class="search-button__overlay"></div>
    <div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
      action="../search.html"
      method="get">
  <i class="fa-solid fa-magnifying-glass"></i>
  <input type="search"
         class="form-control"
         name="q"
         id="search-input"
         placeholder="Search the docs ..."
         aria-label="Search the docs ..."
         autocomplete="off"
         autocorrect="off"
         autocapitalize="off"
         spellcheck="false"/>
  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
  </div>

  <div class="pst-async-banner-revealer d-none">
  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>

  
    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
<div class="bd-header__inner bd-page-width">
  <button class="sidebar-toggle primary-toggle" aria-label="Site navigation">
    <span class="fa-solid fa-bars"></span>
  </button>
  
  
  <div class="col-lg-3 navbar-header-items__start">
    
      <div class="navbar-item">

  

<a class="navbar-brand logo" href="../index.html">
  
  
  
  
  
    
    
      
    
    
    <img src="../_static/brevitas_logo_black.svg" class="logo__image only-light" alt="Brevitas Documentation - v0.10.0 - Home"/>
    <script>document.write(`<img src="../_static/brevitas_logo_white.svg" class="logo__image only-dark" alt="Brevitas Documentation - v0.10.0 - Home"/>`);</script>
  
  
</a></div>
    
  </div>
  
  <div class="col-lg-9 navbar-header-items">
    
    <div class="me-auto navbar-header-items__center">
      
        <div class="navbar-item">
<nav class="navbar-nav">
  <ul class="bd-navbar-elements navbar-nav">
    
<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../setup.html">
    Setup
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../getting_started.html">
    Getting Started
  </a>
</li>


<li class="nav-item pst-header-nav-item current active">
  <a class="nav-link nav-internal" href="index.html">
    Tutorials
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../settings.html">
    Settings
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../api_reference/index.html">
    API reference
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../architecture.html">
    Architecture
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../faq.html">
    FAQ
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../about.html">
    About
  </a>
</li>

  </ul>
</nav></div>
      
    </div>
    
    
    <div class="navbar-header-items__end">
      
        <div class="navbar-item navbar-persistent--container">
          

 <script>
 document.write(`
   <button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="fa-solid fa-magnifying-glass"></i>
    <span class="search-button__default-text">Search</span>
    <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
   </button>
 `);
 </script>
        </div>
      
      
        <div class="navbar-item">

<script>
document.write(`
  <button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
    <span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
    <span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
  </button>
`);
</script></div>
      
    </div>
    
  </div>
  
  
    <div class="navbar-persistent--mobile">

 <script>
 document.write(`
   <button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="fa-solid fa-magnifying-glass"></i>
    <span class="search-button__default-text">Search</span>
    <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
   </button>
 `);
 </script>
    </div>
  

  
    <button class="sidebar-toggle secondary-toggle" aria-label="On this page">
      <span class="fa-solid fa-outdent"></span>
    </button>
  
</div>

    </header>
  

  <div class="bd-container">
    <div class="bd-container__inner bd-page-width">
      
      
      
      <div class="bd-sidebar-primary bd-sidebar">
        

  
  <div class="sidebar-header-items sidebar-primary__section">
    
    
      <div class="sidebar-header-items__center">
        
          
          
            <div class="navbar-item">
<nav class="navbar-nav">
  <ul class="bd-navbar-elements navbar-nav">
    
<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../setup.html">
    Setup
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../getting_started.html">
    Getting Started
  </a>
</li>


<li class="nav-item pst-header-nav-item current active">
  <a class="nav-link nav-internal" href="index.html">
    Tutorials
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../settings.html">
    Settings
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../api_reference/index.html">
    API reference
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../architecture.html">
    Architecture
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../faq.html">
    FAQ
  </a>
</li>


<li class="nav-item pst-header-nav-item">
  <a class="nav-link nav-internal" href="../about.html">
    About
  </a>
</li>

  </ul>
</nav></div>
          
        
      </div>
    
    
    
      <div class="sidebar-header-items__end">
        
          <div class="navbar-item">

<script>
document.write(`
  <button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
    <span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
    <span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
  </button>
`);
</script></div>
        
      </div>
    
  </div>
  
    <div class="sidebar-primary-items__start sidebar-primary__section">
        <div class="sidebar-primary-item">
<nav class="bd-docs-nav bd-links"
     aria-label="Section Navigation">
  <p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
  <div class="bd-toc-item navbar-nav"><p aria-level="2" class="caption" role="heading"><span class="caption-text">Tutorials:</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="tvmcon2021.html">Brevitas TVMCon 2021 tutorial</a></li>









<li class="toctree-l1"><a class="reference internal" href="quant_tensor_quant_conv2d_overview.html">An overview of QuantTensor and QuantConv2d</a></li>
<li class="toctree-l1"><a class="reference internal" href="quant_activation_overview.html">An Overview of Quantized Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="anatomy_quantizer.html">Anatomy of a Quantizer</a></li>
<li class="toctree-l1"><a class="reference internal" href="quant_recurrent.html">Quantized RNNs and LSTMs</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">ONNX Export</a></li>
</ul>
</div>
</nav></div>
    </div>
  
  
  <div class="sidebar-primary-items__end sidebar-primary__section">
  </div>
  
  <div id="rtd-footer-container"></div>


      </div>
      
      <main id="main-content" class="bd-main" role="main">
        
        
          <div class="bd-content">
            <div class="bd-article-container">
              
              <div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
  
    <div class="header-article-items__start">
      
        <div class="header-article-item">



<nav aria-label="Breadcrumb" class="d-print-none">
  <ul class="bd-breadcrumbs">
    
    <li class="breadcrumb-item breadcrumb-home">
      <a href="../index.html" class="nav-link" aria-label="Home">
        <i class="fa-solid fa-home"></i>
      </a>
    </li>
    
    <li class="breadcrumb-item"><a href="index.html" class="nav-link">Tutorials</a></li>
    
    <li class="breadcrumb-item active" aria-current="page">ONNX Export</li>
  </ul>
</nav>
</div>
      
    </div>
  
  
</div>
</div>
              
              
              
                
<div id="searchbox"></div>
                <article class="bd-article">
                  
  <section id="ONNX-Export">
<h1>ONNX Export<a class="headerlink" href="#ONNX-Export" title="Permalink to this heading">#</a></h1>
<section id="Requirements">
<h2>Requirements<a class="headerlink" href="#Requirements" title="Permalink to this heading">#</a></h2>
<p>Brevitas requires Python 3.8+ and PyTorch 1.9.1+ and can be installed from PyPI with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">brevitas</span></code>.</p>
<p>For this notebook, you will also need to install <code class="docutils literal notranslate"><span class="pre">onnx</span></code>, <code class="docutils literal notranslate"><span class="pre">onnxruntime</span></code>, <code class="docutils literal notranslate"><span class="pre">onnxoptimizer</span></code> and <code class="docutils literal notranslate"><span class="pre">netron</span></code> (for visualization of ONNX models). For this tutorial, PyTorch 1.8.1+ is required.</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[ ]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="o">%</span><span class="k">pip</span> install netron
</pre></div>
</div>
</div>
</section>
<section id="Introduction">
<h2>Introduction<a class="headerlink" href="#Introduction" title="Permalink to this heading">#</a></h2>
<p>The main goal of this notebook is to show how to use Brevitas to export your models in the two standards currently supported by ONNX for quantized models: QCDQ and QOps (i.e., <code class="docutils literal notranslate"><span class="pre">QLinearConv</span></code>, <code class="docutils literal notranslate"><span class="pre">QLinearMatMul</span></code>). Once exported, these models can be run using onnxruntime.</p>
<p>This notebook doesn’t cover QONNX, a custom extension over ONNX with more features for quantization representation that Brevitas can generate as export, which requires the <code class="docutils literal notranslate"><span class="pre">qonnx</span></code> library.</p>
</section>
<section id="QuantizeLinear-Clip-DeQuantizeLinear-(QCDQ)">
<h2>QuantizeLinear-Clip-DeQuantizeLinear (QCDQ)<a class="headerlink" href="#QuantizeLinear-Clip-DeQuantizeLinear-(QCDQ)" title="Permalink to this heading">#</a></h2>
<p>QCDQ is a style of representation introduced by Brevitas that extends the standard QDQ representation for quantization in ONNX. In Q(C)DQ export, before each operation, two (or three, in case of clipping) extra ONNX nodes are added: - <code class="docutils literal notranslate"><span class="pre">QuantizeLinear</span></code>: Takes as input a FP tensor, and quantizes it with a given zero-point and scale factor. It returns an (U)Int8 tensor. - <code class="docutils literal notranslate"><span class="pre">Clip</span></code> (Optional): Takes as input an INT8 tensor, and, given integer min/max values, restricts its range. -
<code class="docutils literal notranslate"><span class="pre">DeQuantizeLinear</span></code>: Takes as input an INT8 tensor, and converts it to its FP equivalent with a given zero-point and scale factor.</p>
<p>There are several implications associated with this set of operations: - It is not possible to quantize with a bit-width higher than 8. Although <code class="docutils literal notranslate"><span class="pre">DequantizeLinear</span></code> supports both (U)Int8 and Int32 as input, currently <code class="docutils literal notranslate"><span class="pre">QuantizeLinear</span></code> can only output (U)Int8. - Using only <code class="docutils literal notranslate"><span class="pre">QuantizeLinear</span></code> and <code class="docutils literal notranslate"><span class="pre">DeQuantizeLinear</span></code>, it is possible only to quantize to 8 bit (signed or unsigned). - The addition of the <code class="docutils literal notranslate"><span class="pre">Clip</span></code> function between <code class="docutils literal notranslate"><span class="pre">QuantizeLinear</span></code> and <code class="docutils literal notranslate"><span class="pre">DeQuantizeLinear</span></code>, allows quantizing a
tensor to bit-width &lt; 8. This is done by Clipping the Int8 tensor coming out of the <code class="docutils literal notranslate"><span class="pre">QuantizeLinear</span></code> node with the min/max values of the desired bit-width (e.g., for unsigned 3 bit, <code class="docutils literal notranslate"><span class="pre">min_val</span> <span class="pre">=</span> <span class="pre">0</span></code> and <code class="docutils literal notranslate"><span class="pre">max_val</span> <span class="pre">=</span> <span class="pre">7</span></code>). - It is possible to perform both per-tensor and per-channel quantization (requires ONNX Opset &gt;=13).</p>
<p>We will go through all these cases with some examples.</p>
<section id="Basic-Example">
<h3>Basic Example<a class="headerlink" href="#Basic-Example" title="Permalink to this heading">#</a></h3>
<p>First, we will look at <code class="docutils literal notranslate"><span class="pre">brevitas.nn.QuantLinear</span></code>, a quantized alternative to <code class="docutils literal notranslate"><span class="pre">torch.nn.Linear</span></code>. Similar considerations can also be used for <code class="docutils literal notranslate"><span class="pre">QuantConv1d</span></code>, <code class="docutils literal notranslate"><span class="pre">QuantConv2d</span></code>, <code class="docutils literal notranslate"><span class="pre">QuantConvTranspose1d</span></code> and <code class="docutils literal notranslate"><span class="pre">QuantConvTranspose2d</span></code>.</p>
<p>Brevitas offers several APIs to export PyTorch modules into several different formats, all sharing the same interface. The three required arguments are: - The PyTorch model to export - A representative input tensor (or a tuple of input args) - The path where to save the exported model</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[1]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">netron</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">time</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">IPython.display</span><span class="w"> </span><span class="kn">import</span> <span class="n">IFrame</span>

<span class="k">def</span><span class="w"> </span><span class="nf">show_netron</span><span class="p">(</span><span class="n">model_path</span><span class="p">,</span> <span class="n">port</span><span class="p">):</span>
    <span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mf">3.</span><span class="p">)</span>
    <span class="n">netron</span><span class="o">.</span><span class="n">start</span><span class="p">(</span><span class="n">model_path</span><span class="p">,</span> <span class="n">address</span><span class="o">=</span><span class="p">(</span><span class="s2">&quot;localhost&quot;</span><span class="p">,</span> <span class="n">port</span><span class="p">),</span> <span class="n">browse</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">IFrame</span><span class="p">(</span><span class="n">src</span><span class="o">=</span><span class="sa">f</span><span class="s2">&quot;http://localhost:</span><span class="si">{</span><span class="n">port</span><span class="si">}</span><span class="s2">/&quot;</span><span class="p">,</span> <span class="n">width</span><span class="o">=</span><span class="s2">&quot;100%&quot;</span><span class="p">,</span> <span class="n">height</span><span class="o">=</span><span class="mi">400</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[2]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">brevitas.nn</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">qnn</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">brevitas.export</span><span class="w"> </span><span class="kn">import</span> <span class="n">export_onnx_qcdq</span>

<span class="n">IN_CH</span> <span class="o">=</span> <span class="mi">3</span>
<span class="n">OUT_CH</span> <span class="o">=</span> <span class="mi">128</span>
<span class="n">BATCH_SIZE</span> <span class="o">=</span> <span class="mi">1</span>

<span class="n">linear</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantLinear</span><span class="p">(</span><span class="n">IN_CH</span><span class="p">,</span> <span class="n">OUT_CH</span><span class="p">,</span> <span class="n">bias</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">inp</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">BATCH_SIZE</span><span class="p">,</span> <span class="n">IN_CH</span><span class="p">)</span>
<span class="n">path</span> <span class="o">=</span> <span class="s1">&#39;quant_linear_qcdq.onnx&#39;</span>

<span class="n">exported_model</span> <span class="o">=</span> <span class="n">export_onnx_qcdq</span><span class="p">(</span><span class="n">linear</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">inp</span><span class="p">,</span> <span class="n">export_path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="n">opset_version</span><span class="o">=</span><span class="mi">13</span><span class="p">)</span>
<br/></pre></div>
</div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">show_netron</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="mi">8082</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
Serving &#39;quant_linear_qcdq.onnx&#39; at http://localhost:8082
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[3]:
</pre></div>
</div>
<div class="output_area rendered_html docutils container">
<iframe
    width="100%"
    height="400"
    src="http://localhost:8082/"
    frameborder="0"
    allowfullscreen

></iframe></div>
</div>
<p>As can be seen from the exported ONNX, by default in <code class="docutils literal notranslate"><span class="pre">QuantLinear</span></code> only the weights are quantized, and they go through a Quantize/DequantizeLinear before being used for the <code class="docutils literal notranslate"><span class="pre">Gemm</span></code> operation. Moreover, there is a clipping operation that sets the min/max values for the tensor to ±127. This is because in Brevitas the default weight quantizer (but not the activation one) has the option <code class="docutils literal notranslate"><span class="pre">narrow_range=True</span></code>. This option, in case of signed quantization, makes sure that the quantization
interval is perfectly symmetric (otherwise, the minimum integer would be -128), so that it can absorb sign changes (e.g. from batch norm fusion).</p>
<p>The input and bias remain in floating point. In QCDQ export this is not a problem since the weights, which are quantized at 8 bit, are dequantized to floating-point before being passed as input to the <code class="docutils literal notranslate"><span class="pre">Gemm</span></code> node.</p>
</section>
<section id="Complete-Model">
<h3>Complete Model<a class="headerlink" href="#Complete-Model" title="Permalink to this heading">#</a></h3>
<p>A similar approach can be used with entire PyTorch models, rather than a single layer.</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[4]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">QuantModel</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">linear</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantLinear</span><span class="p">(</span><span class="n">IN_CH</span><span class="p">,</span> <span class="n">OUT_CH</span><span class="p">,</span> <span class="n">bias</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">weight_scaling_per_output_channel</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">act</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantReLU</span><span class="p">()</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inp</span><span class="p">):</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">linear</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">act</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="k">return</span> <span class="n">inp</span>

<span class="n">model</span> <span class="o">=</span> <span class="n">QuantModel</span><span class="p">()</span>
<span class="n">inp</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">BATCH_SIZE</span><span class="p">,</span> <span class="n">IN_CH</span><span class="p">)</span>
<span class="n">path</span> <span class="o">=</span> <span class="s1">&#39;quant_model_qcdq.onnx&#39;</span>

<span class="n">exported_model</span> <span class="o">=</span> <span class="n">export_onnx_qcdq</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">inp</span><span class="p">,</span> <span class="n">export_path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="n">opset_version</span><span class="o">=</span><span class="mi">13</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[5]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">show_netron</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="mi">8083</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
Serving &#39;quant_model_qcdq.onnx&#39; at http://localhost:8083
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[5]:
</pre></div>
</div>
<div class="output_area rendered_html docutils container">
<iframe
    width="100%"
    height="400"
    src="http://localhost:8083/"
    frameborder="0"
    allowfullscreen

></iframe></div>
</div>
<p>We did not specify the argument <code class="docutils literal notranslate"><span class="pre">output_quant</span></code> in our <code class="docutils literal notranslate"><span class="pre">QuantLinear</span></code> layer, thus the output of the layer will be passed directly to the ReLU function without any intermediate re-quantization step.</p>
<p>Furthermore, we have defined a per-channel quantization, so the scale factor will be a Tensor rather than a scalar (ONNX opset &gt;= 13 is required for this).</p>
<p>Finally, since we are using a <code class="docutils literal notranslate"><span class="pre">QuantReLU</span></code> with default initialization, the output is re-quantized as an UInt8 Tensor.</p>
</section>
<section id="The-C-in-QCDQ-(Bitwidth-&lt;=-8)">
<h3>The C in QCDQ (Bitwidth &lt;= 8)<a class="headerlink" href="#The-C-in-QCDQ-(Bitwidth-<=-8)" title="Permalink to this heading">#</a></h3>
<p>As mentioned, Brevitas export expands on the basic QDQ format by adding the <code class="docutils literal notranslate"><span class="pre">Clip</span></code> operation.</p>
<p>This operation is inserted between the <code class="docutils literal notranslate"><span class="pre">QuantizeLinear</span></code> and <code class="docutils literal notranslate"><span class="pre">DeQuantizeLinear</span></code> nodes, and thus operates on integers.</p>
<p>Normally, using only the QDQ format, it would be impossible to export models quantized with fewer than 8 bits.</p>
<p>In Brevitas however, if a quantized layer with bit-width &lt;= 8 is exported, the Clip node will be automatically inserted, with the min/max values computed based on the particular type of quantization performed (i.e., signed vs unsigned, narrow range vs no narrow range, etc.).</p>
<p>Even though the Tensor data type will still be an Int8 or UInt8, its values are restricted to the desired bit-width.</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">Model</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">linear</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantLinear</span><span class="p">(</span><span class="n">IN_CH</span><span class="p">,</span> <span class="n">OUT_CH</span><span class="p">,</span> <span class="n">bias</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">weight_bit_width</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">act</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantReLU</span><span class="p">(</span><span class="n">bit_width</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inp</span><span class="p">):</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">linear</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">act</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="k">return</span> <span class="n">inp</span>

<span class="n">model</span> <span class="o">=</span> <span class="n">Model</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>

<span class="n">inp</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">BATCH_SIZE</span><span class="p">,</span> <span class="n">IN_CH</span><span class="p">)</span>
<span class="n">path</span> <span class="o">=</span> <span class="s1">&#39;quant_model_3b_4b_qcdq.onnx&#39;</span>

<span class="n">exported_model</span> <span class="o">=</span> <span class="n">export_onnx_qcdq</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">inp</span><span class="p">,</span> <span class="n">export_path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="n">opset_version</span><span class="o">=</span><span class="mi">13</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">show_netron</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="mi">8084</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
Serving &#39;quant_model_3b_4b_qcdq.onnx&#39; at http://localhost:8084
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]:
</pre></div>
</div>
<div class="output_area rendered_html docutils container">
<iframe
    width="100%"
    height="400"
    src="http://localhost:8084/"
    frameborder="0"
    allowfullscreen

></iframe></div>
</div>
<p>As can be seen from the generated ONNX, the weights of the <code class="docutils literal notranslate"><span class="pre">QuantLinear</span></code> layer are clipped between -3 and 3, considering that we are performing a signed 3 bit quantization, with <code class="docutils literal notranslate"><span class="pre">narrow_range=True</span></code>.</p>
<p>Similarly, the output of the QuantReLU is clipped between 0 and 15, since in this case we are doing an unsigned 4 bit quantization.</p>
</section>
</section>
<section id="QOps-Export">
<h2>QOps Export<a class="headerlink" href="#QOps-Export" title="Permalink to this heading">#</a></h2>
<p>Another supported style for exporting quantized operations in ONNX is represented by QOps.</p>
<p>Compared to QCDQ, where it is possible to re-use standard floating point nodes (e.g., GEMM or Conv2d) preceded by QCDQ nodes, with QOps the entire layer is replaced by its quantized counterpart.</p>
<p>Opposite to what happens with QCDQ, all elements of the computation in this case have to be quantized: Input, Weight, Bias (if present), and Output tensors.</p>
<p>This introduces some constraints on how we define our quantized layers through Brevitas.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">brevitas.quant.scaled_int</span><span class="w"> </span><span class="kn">import</span> <span class="n">Int8ActPerTensorFloat</span><span class="p">,</span> <span class="n">Int32Bias</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">brevitas.export</span><span class="w"> </span><span class="kn">import</span> <span class="n">export_onnx_qop</span>

<span class="n">IN_CH</span> <span class="o">=</span> <span class="mi">3</span>
<span class="n">IMG_SIZE</span> <span class="o">=</span> <span class="mi">128</span>
<span class="n">OUT_CH</span> <span class="o">=</span> <span class="mi">128</span>
<span class="n">BATCH_SIZE</span> <span class="o">=</span> <span class="mi">1</span>

<span class="k">class</span><span class="w"> </span><span class="nc">Model</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">input_quant</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantIdentity</span><span class="p">(</span><span class="n">return_quant_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">linear</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantConv2d</span><span class="p">(</span>
            <span class="n">IN_CH</span><span class="p">,</span> <span class="n">OUT_CH</span><span class="p">,</span> <span class="n">kernel_size</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">bias</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
            <span class="n">weight_bit_width</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">bias_quant</span><span class="o">=</span><span class="n">Int32Bias</span><span class="p">,</span>
            <span class="n">output_quant</span><span class="o">=</span><span class="n">Int8ActPerTensorFloat</span><span class="p">)</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inp</span><span class="p">):</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_quant</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">linear</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="k">return</span> <span class="n">inp</span>

<span class="n">inp</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">BATCH_SIZE</span><span class="p">,</span> <span class="n">IN_CH</span><span class="p">,</span> <span class="n">IMG_SIZE</span><span class="p">,</span> <span class="n">IMG_SIZE</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">Model</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>


<span class="n">export_onnx_qop</span><span class="p">(</span>
    <span class="n">model</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">inp</span><span class="p">,</span> <span class="n">export_path</span><span class="o">=</span><span class="s2">&quot;quant_model_qop.onnx&quot;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area stderr docutils container">
<div class="highlight"><pre>
c:\users\alessand\documents\brevitas\src\brevitas\export\onnx\standard\manager.py:23: UserWarning: ONNX opset version set to 13, override with opset_version=
  warnings.warn(f&#34;ONNX opset version set to {DEFAULT_OPSET}, override with {ka}=&#34;)
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]:
</pre></div>
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
ir_version: 7
producer_name: &#34;pytorch&#34;
producer_version: &#34;1.13.1&#34;
graph {
  node {
    output: &#34;/input_quant/export_handler/Constant_output_0&#34;
    name: &#34;/input_quant/export_handler/Constant&#34;
    op_type: &#34;Constant&#34;
    attribute {
      name: &#34;value&#34;
      t {
        data_type: 1
        raw_data: &#34;\000\000\000&lt;&#34;
      }
      type: TENSOR
    }
  }
  node {
    output: &#34;/input_quant/export_handler/Constant_1_output_0&#34;
    name: &#34;/input_quant/export_handler/Constant_1&#34;
    op_type: &#34;Constant&#34;
    attribute {
      name: &#34;value&#34;
      t {
        data_type: 3
        raw_data: &#34;\000&#34;
      }
      type: TENSOR
    }
  }
  node {
    input: &#34;inp.1&#34;
    input: &#34;/input_quant/export_handler/Constant_output_0&#34;
    input: &#34;/input_quant/export_handler/Constant_1_output_0&#34;
    output: &#34;/input_quant/export_handler/QuantizeLinear_output_0&#34;
    name: &#34;/input_quant/export_handler/QuantizeLinear&#34;
    op_type: &#34;QuantizeLinear&#34;
  }
  node {
    output: &#34;/linear/export_handler/Constant_output_0&#34;
    name: &#34;/linear/export_handler/Constant&#34;
    op_type: &#34;Constant&#34;
    attribute {
      name: &#34;value&#34;
      t {
        dims: 128
        dims: 3
        dims: 3
        dims: 3
        data_type: 3
        raw_data: &#34;\374\372\376\374\005\000\375\374\004\375\373\373\375\007\376\374\377\000\000\373\373\004\005\371\003\375\004\373\004\374\000\006\002\003\003\005\004\377\005\000\373\376\375\376\002\376\004\377\003\005\375\371\006\373\003\007\377\374\005\375\375\006\375\377\374\001\005\371\006\005\007\376\376\372\376\004\001\374\002\373\373\376\002\376\375\377\001\376\006\371\002\000\004\005\005\000\004\373\004\002\003\000\374\376\005\000\004\372\004\000\373\000\006\377\002\005\004\005\374\000\007\377\374\371\373\007\004\376\372\001\005\001\372\377\003\001\375\006\372\377\006\003\006\004\001\004\372\005\006\003\376\373\374\375\376\005\000\004\377\372\373\000\007\377\373\003\373\376\374\374\377\375\377\003\372\005\004\007\003\375\377\001\007\377\373\374\000\377\376\374\373\377\373\375\003\004\004\376\004\377\375\003\003\377\004\000\005\004\000\372\005\007\003\004\377\373\003\371\373\002\377\006\006\007\377\376\375\002\006\005\004\374\002\000\373\004\002\002\374\371\372\371\375\001\004\000\006\376\377\002\000\372\001\001\375\007\376\005\001\373\003\374\005\003\007\005\372\004\006\375\005\003\001\373\376\374\002\376\377\376\000\006\001\375\376\377\374\000\005\002\005\006\371\375\005\375\376\374\004\001\003\001\372\005\007\371\005\000\372\001\001\371\007\374\372\373\373\372\376\004\000\002\375\376\000\004\003\003\375\003\001\376\006\001\000\372\374\376\373\002\002\004\372\377\374\005\000\001\005\005\374\007\003\377\377\000\007\002\377\377\377\374\001\001\376\000\377\373\001\004\376\003\000\007\005\000\374\372\376\005\003\003\004\372\375\372\377\006\376\374\007\373\002\374\003\377\374\002\007\373\004\376\004\004\003\005\373\003\005\376\001\000\002\371\376\000\374\377\372\375\005\373\002\373\373\377\004\375\006\377\005\005\002\375\375\003\376\376\006\002\371\000\002\373\000\006\002\372\372\006\374\372\004\006\004\000\003\001\377\371\376\003\003\373\005\000\001\003\004\001\005\001\004\373\373\372\002\371\375\372\004\377\005\375\376\374\375\003\372\001\373\372\376\005\
003\372\004\373\004\374\374\376\376\377\371\375\004\375\377\376\007\004\372\000\007\372\006\002\006\001\006\372\004\004\003\002\375\006\374\002\001\001\000\376\376\006\373\374\002\372\005\374\004\004\001\374\004\377\373\002\376\001\377\003\377\007\004\372\371\002\375\377\373\002\376\375\377\006\001\001\000\374\001\006\004\371\377\375\374\377\376\003\372\373\002\005\374\000\002\004\372\004\372\003\006\375\003\377\376\000\377\374\006\377\374\375\377\373\376\372\375\006\004\371\372\374\375\004\002\372\376\001\001\002\373\000\003\000\371\001\003\377\376\371\376\004\000\003\376\002\006\004\372\007\005\004\376\000\007\372\003\002\005\005\004\372\002\377\006\002\371\375\375\372\376\005\003\000\002\371\005\372\373\377\371\376\005\374\377\007\003\001\376\006\376\001\374\374\001\373\006\376\376\001\372\377\003\006\372\373\003\377\376\000\377\373\004\372\371\376\002\004\004\006\001\372\001\376\005\001\000\000\007\002\375\002\375\375\006\007\375\375\002\006\371\375\002\377\002\377\000\373\001\372\372\001\377\372\001\002\000\375\373\377\372\001\371\372\007\372\001\377\372\004\376\376\374\375\373\373\005\371\375\006\005\007\374\373\005\372\000\001\374\005\000\002\373\004\001\004\006\002\003\373\376\372\374\003\375\005\000\005\373\001\375\374\002\002\000\373\374\003\005\376\003\374\374\373\374\000\004\371\375\372\003\375\005\005\006\007\371\003\372\003\375\004\374\001\376\373\000\004\003\001\003\372\377\003\004\374\000\376\002\377\001\374\376\002\002\001\005\375\373\001\372\000\007\004\007\006\006\000\004\004\006\000\377\375\000\002\374\376\374\006\373\377\000\374\006\373\005\001\001\006\005\373\373\001\003\371\006\372\003\005\372\003\005\006\005\006\001\001\377\372\001\003\372\005\002\376\377\373\005\376\375\373\005\004\007\001\000\002\001\374\004\003\377\004\372\373\373\007\375\002\002\377\373\007\001\004\374\007\376\000\003\376\006\371\377\003\376\003\004\375\006\376\371\373\373\004\000\005\377\372\372\377\004\002\001\000\005\372\004\377\376\375\001\005\375\375\000\003\006\374\
004\377\004\006\000\374\003\000\005\376\372\371\371\000\374\372\006\004\006\376\377\001\377\376\373\374\000\003\004\372\375\000\006\002\374\377\004\372\371\373\001\006\377\003\007\377\373\000\371\002\376\003\002\377\006\006\006\371\006\373\377\006\000\374\375\376\001\376\007\003\007\376\004\001\005\003\375\372\003\004\376\374\005\372\372\000\006\377\003\000\002\001\003\375\000\004\375\372\000\001\000\000\002\000\004\005\377\005\007\376\372\001\374\006\002\376\002\005\374\372\000\375\372\372\000\001\000\377\007\376\000\374\375\000\373\003\001\006\003\376\007\374\376\374\005\371\372\001\374\374\002\375\004\001\002\002\376\003\373\000\375\375\005\373\002\376\371\006\004\001\001\371\376\005\377\375\005\003\374\375\002\373\376\001\002\001\007\002\004\376\375\377\376\004\373\000\001\375\377\372\376\002\001\375\006\005\006\004\376\004\004\001\001\377\004\006\003\001\005\006\001\377\000\000\000\372\004\375\004\377\377\006\377\373\003\375\373\004\005\377\006\376\374\374\371\376\003\376\374\001\373\001\375\001\376\376\000\376\371\376\377\372\373\374\374\375\376\003\376\002\372\375\375\007\377\373\377\006\376\377\373\002\001\000\005\004\006\376\001\373\372\371\001\371\001\373\374\001\375\373\003\375\373\005\373\004\377\002\000\002\006\001\373\375\005\376\004\000\376\003\007\000\377\003\004\005\376\004\003\004\006\006\006\371\002\374\375\003\375\000\375\377\004\003\374\373\004\005\375\003\376\001\001\374\003\377\004\006\003\377\001\003\377\377\371\000\374\003\373\374\006\372\372\006\004\375\375\373\004\005\001\373\371\377\376\004\005\373\374\005\000\376\001\002\003\006\006\374\375\374\377\001\373\003\004\372\004\375\001\371\004\002\001\376\377\005\000\376\376\372\005\000\376\004\371\000\377\377\377\373\377\001\004\002\374\373\000\374\377\373\373\374\005\006\374\003\373\000\006\001\003\371\373\006\374\005\005\006\371\002\005\373\000\377\377\003\005\003\004\376\372\000\005\004\371\372\376\371\005\375\000\001\001\000\006\005\006\002\002\000\003\006\374\005\000\373\372\376\002\372\
006\003\375\373\373\375\002\004\001\007\373\377\004\005\004\375\005\376\376\004\003\000\004\376\006\001\376\003\376\007\006\002\376\001\376\006\371\006\375\375\004\003\006\377\374\004\003\375\372\374\375\006\377\000\004\373\002\006\373\377\374\372\000\000\376\006\373\372\004\001\006\003\377\006\371\006\006\004\004\005\371\376\001\003\372\005\001\002\373\001\372\375\004\372\006\373\375\001\003\375\377\003\372\374\374\373\006\005\373\002\000\004\376\377\004\374\006\374\006\373\004\375\373\006\376\006\002\002\377\372\372\005\004\375\000\002\374\002\376\007\373\376\371\377\005\376\006\002\006\376\004\372\000\005\002\002\003\006\004\377\007\374\372\372\002\375\377\001\375\005\374\377\003\007\002\005\006\006\000\001\004\000\376\371\001\000\005\004\375\372\375\004\007\371\374\002\005\000\002\002\004\004\007\005\006\373\006\004\002\005\004\376\375\000\372\004\377\003\374\003\376\006\376\006\006\005\002\006\007\002\372\372\377\373\004\373\375\004\004\003\006\002\000\002\376\000\000\005\006\005\372\003\372\006\001\007\372\002\372\004\001\005\002\005\374\372\372\002\372\001\377\002\006\005\000\005\372\375\007\377\375\004\005\003\372\004\005\376\373\001\372\003\371\371\374\005\002\005\374\377\004\002\376\004\373\377\377\377\001\005\372\003\373\375\006\374\007\376\372\006\005\371\377\005\001\003\005\002\006\003\001\377\374\004\376\374\375\376\001\001\001\004\007\007\000\005\001\376\003\376\000\000\001\001\375\371\006\002\001\373\000\377\007\004\002\374\000\001\377\003\374\003\007\373\371\373\001\005\373\372\005\373\375\005\006\372\000\005\007\003\003\377\005\006\004\374\372\375\003\004\000\005\376\374\374\375\375\377\372\000\004\002\005\002\000\374\376\373\373\376\002\374\000\376\373\000\371\373\372\006\000\376\002\375\376\005\372\004\376\375\005\006\006\004\003\002\002\002\002\375\006\377\000\004\375\004\007\004\005\372\374\004\377\003\377\000\375\372\374\372\000\004\007\002\007\372\376\004\371\375\001\001\007\003\000\004\373\001\001\376\002\377\377\006\002\003\373\373\004\372\
372\376\372\002\002\002\373\001\375\374\000\004\003\376\003\376\002\373\374\003\372\371\001\375\004\371\374\004\005\002\374\371\001\373\377\374\006\373\006\000\005\005\006\006\002\375\002\001\001\005\375\000\372\371\003\004\375\376\003\377\374\005\007\007\377\374\375\374\376\373\003\002\002\374\377\373\004\375\372\374\003\374\005\376\002\373\376\006\005\374\002\371\005\004\001\373\000\377\374\003\000\001\001\003\372\005\001\371\371\000\375\001\375\372\374\003\373\376\001\371\006\005\004\377\004\376\377\377\003\373\001\372\376\006\372\372\005\374\001\374\004\001\004\375\002\002\373\006\000\001\002\377\371\005\005\374\374\006\003\001\002\001\374\377\372\000\377\374\373\371\007\003\375\373\374\373\374\005\004\005\006\002\374\000\372\001\376\002\373\371\372\374\374\377\005\375\371\002\374\374\005\377\007\004\376\007\373\372\007\007\377\004\002\002\007\377\375\002\005\006\003\002\006\376\003\004\003\000\371\002\002\374\006\373\005\003\003\002\003\376\002\004\377\377\371\007\001\373\376\003\002\007\376\002\005\004\374\003\377\374\003\007\004\377\002\001\003\005\373\377\374\002\377\004\000\000\005\007\002\003\376\371\377\006\372\372\002\372\371\375\000\376\005\372\000\373\372\007\002\001\372\374\375\005\005\004\001\002\002\006\372\001\007\373\375\000\372\005\003\000\375\377\001\003\006\000\376\374\002\375\375\003\001\007\376\377\003\000\005\376\374\005\373\004\377\000\375\002\005\001\001\000\001\375\374\001\006\372\375\376\372\371\001\372\005\004\376\373\006\005\375\006\377\001\001\000\006\000\006\007\003\372\004\375\373\372\372\000\374\001\006\007\376\374\371\373\372\375\003\377\372\377\005\002\006\372\006\004\005\000\376\007\003\372\004\377\006\001\373\375\374\373\373\004\004\375\373\005\376\000\001\375\371\372\005\375\000\002\372\003\004\372\003\374\005\002\374\377\001\005\376\377\374\376\005\376\372\003\373\372\006\372\377\373\006\372\004\006\373\005\375\375\007\374\005\002\374\374\002\002\377\375\376\372\005\375\371\003\005\003\372\377\375\372\002\005\000\006\372\005\
371\376\000\001\377\004\004\006\000\377\007\002\006\000\371\375\374\374\001\373\371\002\376\002\000\374\006\001\374\006\005\001\003\376\003\374\003\374\002\007\373\002\004\007\005\374\376\372\372\001\371\002\005\373\376\006\375\372\376\004\003\001\004\376\002\373\006\006\371\372\003\004\006\375\004\007\371\000\000\001\000\374\001\006\002\006\002\000\002\373\372\372\000\372\005\006\004\000\376\372\373\006\007\373\006\373\377\003\375\373\001\377\001\002\376\003\373\002\376\007\371\371\374\006\377\001\002\005\001\376\375\000\377\371\005\372\002\377\375\375\002\375\376\003\003\373\373\005\004\004\373\000\000\007\003\372\375\004\003\376\377\373\376\004\372\004\377\376\007\002\005\003\001\006\006\002\005\373\000\004\000\004\374\372\376\007\002\003\006\002\000\372\001\374\005\376\006\007\373\001\375\004\377\374\375\377\001\377\003\375\005\000\003\376\375\003\377\372\002\006\003\007\005\374\003\006\003\000\375\000\001\000\001\002\374\377\372\004\372\377\377\003\377\007\006\371\003\005\004\007\006\371\006\001\375\001\001\376\002\374\006\375\375\376\377\002\002\007\373\373\374\373\377\001\006\375\375\001\375\373\375\373\372\376\003\371\006\376\376\375\007\377\374\376\377\006\377\001\371\377\007\375\371\005\002\373\003\005\002\371\375\003\003\003\374\000\377\375\003\002\006\006\375\006\002\000\374\373\374\002\003\373\002\375\377\004\006\003\006\000\377\372\375\375\002\002\003\006\003\003\377\373\003\003\003\003\377\004\004\372\377\000\374\375\005\004\005\003\002\375\376\001\376\003\374\002\007\002\376\377\007\006\376\372\374\004\371\004\006\006\374\374\377\374\003\006\371\377\007\372\375\006\374\374\005\372\006\372\371\001\000\375\372\374\373\374\374\374\005\004\002\375\004\007\004\006\002\005\005\372\375\000\004\000\377\004\004\001\374\377\006\003\377\374\000\376\372\376\373\377\006\377\376\002\005\005\372\004\000\001\004\005\373\005\003\371\374\373\000\375\002\375\006\003\001\004\377\374\372\005\006\005\005\005\005\007\372\006\004\006\372\372\002\373\371\001\004\006\374\005\
373\004\006\001\005\006\377\006\373\001\373\373\376\375\007\372\374\372\377\004\006\004\375\374\000\007\005\000\002\377\002\372\002\001\377\372\006\002\001\000\376\375\374\003\376\371\005\001\000\002\372\373\375\004\376\371\374\376\000\004\004\376\375\007\374\377\375\377\001\003\005\372\002\376\003\003\375\001\004\001\001\000\002\004\375\375\372\003\003\372\002\375\372\377\373\000\002\371\005\003\001\001\376\372\374\001\001\376\000\001\376\001\376\005\002\374\002\004\004\000\374\007\000\000\006\003\371\376\371\006\005\006\007\002\371\373\005\372\375\006\003\373\005\375\375\373\002\000\375\005\001\372\377\377\373\375\375\374\000\376\372\000\374\001\001\372\375\373\004\374\000\006\375\004\001\006\000\373\001\375\003\372\000\373\376\003\374\005\007\377\373\007\006\002\371\373\377\004\373\001\374\000\001\004\001\005\375\372\002\376\377\371\374\375\371\373\005\376\374\001\377\376\371\375\371\000\375\373\377\006\002\003\005\372\003\004\005\005\004\000\376\372\371\006\000\377\373\003\376\005\007\006\372\004\007\374\375\376\374\000\001\001\375\003\371\001\006\374\376\006\377\000\001\375\006\004\372\371\001\377\377\377\376\006\375\372\000\371\376\002\374\372\006\372\002\006\005\001\376\004\374\002\376\000\004\376\375\000\376\004\000\006\372\005\007\006\002\004\373\373\006\003\007\001\375\007\007\372\004\005\376\005\376\007\002\376\004\373\373\376\004\372\375\373\374\001\000\375\004\375\375\377\004\001\377\002\376\004\377\001\001\374\376\374\377\377\001\000\000\377\373\374\002\006\001\375\376\000\000\374\006\004\004\004\375\001\376\001\002\373\006\006\376\002\005\005\374\373\377\376\004\005\374\000\376\002\375\376\004\373\001\377\377\002\377\373\372\371\003\003\372\006\000\002\003\005\375\371\375\004\376\374\007\375\371\002\374\000\375\005\006\374\373\004\371\000\007\376\001\375\377\372\372\373\005\005\001\372\377\371\377\375&#34;
      }
      type: TENSOR
    }
  }
  node {
    output: &#34;/linear/export_handler/Constant_1_output_0&#34;
    name: &#34;/linear/export_handler/Constant_1&#34;
    op_type: &#34;Constant&#34;
    attribute {
      name: &#34;value&#34;
      t {
        data_type: 1
        raw_data: &#34;\263-\341&lt;&#34;
      }
      type: TENSOR
    }
  }
  node {
    output: &#34;/linear/export_handler/Constant_2_output_0&#34;
    name: &#34;/linear/export_handler/Constant_2&#34;
    op_type: &#34;Constant&#34;
    attribute {
      name: &#34;value&#34;
      t {
        dims: 128
        data_type: 6
        raw_data: &#34;\271\377\377\377\032\003\000\0009\001\000\000\302\002\000\000;\375\377\377\031\000\000\000\024\003\000\000d\003\000\000\327\374\377\377\363\377\377\377u\003\000\000\374\000\000\000t\000\000\000\321\002\000\000\236\377\377\377\241\377\377\377\237\375\377\377\010\000\000\000\350\002\000\000}\376\377\377\267\377\377\377\374\000\000\000\355\001\000\000N\375\377\377\\\002\000\000\346\002\000\000\317\000\000\000\207\001\000\000?\000\000\000\302\002\000\000Y\377\377\377\326\376\377\377\\\003\000\000\374\376\377\377\334\000\000\000\200\001\000\000\362\377\377\377+\000\000\000\304\375\377\377u\000\000\000\340\000\000\000\275\001\000\000\324\377\377\377\332\000\000\000\026\001\000\000\333\001\000\000\371\375\377\377\363\000\000\000|\002\000\000\335\376\377\377\226\375\377\377\335\002\000\0002\001\000\000F\377\377\377\006\003\000\000\310\375\377\377\344\377\377\377\177\376\377\377&gt;\001\000\000\033\002\000\000I\003\000\000\006\376\377\377\315\375\377\377\033\003\000\000\236\000\000\000@\376\377\377\031\002\000\000\321\002\000\000;\000\000\000\035\377\377\377\354\377\377\377Z\001\000\000N\375\377\377I\001\000\000\030\001\000\000w\377\377\377\303\002\000\000\022\000\000\000\377\001\000\000!\000\000\000\035\001\000\000\003\375\377\377^\377\377\377\336\374\377\377p\377\377\377\351\002\000\000X\376\377\377\247\000\000\000H\376\377\377}\000\000\000\225\374\377\3776\001\000\000\301\001\000\000\210\001\000\000\374\376\377\377\307\377\377\377\320\374\377\377\267\377\377\377F\375\377\377\352\377\377\377=\377\377\3770\376\377\377#\000\000\000\313\376\377\377\334\000\000\000\261\001\000\000\363\001\000\000\037\001\000\000\220\377\377\377\202\000\000\000d\377\377\377\013\002\000\000\266\002\000\000\347\374\377\377+\001\000\000\301\376\377\377\341\377\377\377O\003\000\000\037\375\377\377\244\375\377\377\352\000\000\000\302\001\000\000I\002\000\000~\377\377\377*\376\377\377\333\000\000\000\214\000\000\000\014\002\000\000&#34;
      }
      type: TENSOR
    }
  }
  node {
    input: &#34;/input_quant/export_handler/QuantizeLinear_output_0&#34;
    input: &#34;/input_quant/export_handler/Constant_output_0&#34;
    input: &#34;/input_quant/export_handler/Constant_1_output_0&#34;
    input: &#34;/linear/export_handler/Constant_output_0&#34;
    input: &#34;/linear/export_handler/Constant_1_output_0&#34;
    input: &#34;/input_quant/export_handler/Constant_1_output_0&#34;
    input: &#34;/input_quant/export_handler/Constant_output_0&#34;
    input: &#34;/input_quant/export_handler/Constant_1_output_0&#34;
    input: &#34;/linear/export_handler/Constant_2_output_0&#34;
    output: &#34;/linear/export_handler/QLinearConv_output_0&#34;
    name: &#34;/linear/export_handler/QLinearConv&#34;
    op_type: &#34;QLinearConv&#34;
    attribute {
      name: &#34;dilations&#34;
      ints: 1
      ints: 1
      type: INTS
    }
    attribute {
      name: &#34;group&#34;
      i: 1
      type: INT
    }
    attribute {
      name: &#34;kernel_shape&#34;
      ints: 3
      ints: 3
      type: INTS
    }
    attribute {
      name: &#34;pads&#34;
      ints: 0
      ints: 0
      ints: 0
      ints: 0
      type: INTS
    }
    attribute {
      name: &#34;strides&#34;
      ints: 1
      ints: 1
      type: INTS
    }
  }
  node {
    input: &#34;/linear/export_handler/QLinearConv_output_0&#34;
    input: &#34;/input_quant/export_handler/Constant_output_0&#34;
    input: &#34;/input_quant/export_handler/Constant_1_output_0&#34;
    output: &#34;10&#34;
    name: &#34;/linear/export_handler/DequantizeLinear&#34;
    op_type: &#34;DequantizeLinear&#34;
  }
  name: &#34;torch_jit&#34;
  input {
    name: &#34;inp.1&#34;
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 3
          }
          dim {
            dim_value: 128
          }
          dim {
            dim_value: 128
          }
        }
      }
    }
  }
  output {
    name: &#34;10&#34;
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 128
          }
          dim {
            dim_value: 126
          }
          dim {
            dim_value: 126
          }
        }
      }
    }
  }
}
opset_import {
  domain: &#34;&#34;
  version: 13
}
</pre></div></div>
</div>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">show_netron</span><span class="p">(</span><span class="s2">&quot;quant_model_qop.onnx&quot;</span><span class="p">,</span> <span class="mi">8085</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
Serving &#39;quant_model_qop.onnx&#39; at http://localhost:8085
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]:
</pre></div>
</div>
<div class="output_area rendered_html docutils container">
<iframe
    width="100%"
    height="400"
    src="http://localhost:8085/"
    frameborder="0"
    allowfullscreen

></iframe></div>
</div>
<p>In this case, we need to make sure that our input to <code class="docutils literal notranslate"><span class="pre">QuantLinear</span></code> is quantized. Using the approach shown above, with a standalone <code class="docutils literal notranslate"><span class="pre">QuantIdentity</span></code>, Brevitas will add a <code class="docutils literal notranslate"><span class="pre">QuantizeLinear</span></code> node. If <code class="docutils literal notranslate"><span class="pre">return_quant_tensor=True</span></code> is specified in <code class="docutils literal notranslate"><span class="pre">QuantIdentity</span></code>, a <code class="docutils literal notranslate"><span class="pre">DeQuantizeLinear</span></code> node won’t be added. Setting <code class="docutils literal notranslate"><span class="pre">input_quant</span></code> in <code class="docutils literal notranslate"><span class="pre">QuantConv2d</span></code> is also an option.</p>
<p>Note that the way <code class="docutils literal notranslate"><span class="pre">return_quant_tensor=True</span></code> is interpreted differs between QCDQ export and QOps export. With QCDQ, it doesn’t affect the export, as a dequantize node is always generated. With QOps export, it prevents a quantization node from being inserted, so that an integer tensor is passed to the next layer.</p>
<p>Moreover, our <code class="docutils literal notranslate"><span class="pre">QuantLinear</span></code> layer has to specify how to re-quantize the output - in this case, with the <code class="docutils literal notranslate"><span class="pre">Int8ActPerTensorFloat</span></code> activation quantizer, otherwise an error will be raised during export-time.</p>
<p>Similarly, if the bias is present, it has to be quantized or an error will be raised.</p>
<section id="Clipping-in-QOps">
<h3>Clipping in QOps<a class="headerlink" href="#Clipping-in-QOps" title="Permalink to this heading">#</a></h3>
<p>Even when using <code class="docutils literal notranslate"><span class="pre">QLinearConv</span></code> and <code class="docutils literal notranslate"><span class="pre">QLinearMatMul</span></code>, it is still possible to represent bit-width &lt; 8 through the use of clipping.</p>
<p>However, in this case the <code class="docutils literal notranslate"><span class="pre">Clip</span></code> operation over the weights won’t be captured in the exported ONNX graph. Instead, it will be performed at export-time, and the clipped tensor will be exported in the ONNX graph.</p>
<p>Examining the last exported model, it is possible to see that the weight tensor, even though it has Int8 as its type, has min/max values equal to <code class="docutils literal notranslate"><span class="pre">[-7,</span> <span class="pre">7]</span></code>, given that it is quantized at 4 bits with narrow_range set to True.</p>
</section>
</section>
<section id="ONNX-Runtime">
<h2>ONNX Runtime<a class="headerlink" href="#ONNX-Runtime" title="Permalink to this heading">#</a></h2>
<section id="QCDQ">
<h3>QCDQ<a class="headerlink" href="#QCDQ" title="Permalink to this heading">#</a></h3>
<p>Since for QCDQ we are only using standard ONNX operations, it is possible to run the exported model using ONNX Runtime.</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">onnxruntime</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">ort</span>

<span class="k">class</span><span class="w"> </span><span class="nc">Model</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">linear</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantLinear</span><span class="p">(</span><span class="n">IN_CH</span><span class="p">,</span> <span class="n">OUT_CH</span><span class="p">,</span> <span class="n">bias</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">weight_bit_width</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">act</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantReLU</span><span class="p">(</span><span class="n">bit_width</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inp</span><span class="p">):</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">linear</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">act</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="k">return</span> <span class="n">inp</span>

<span class="n">model</span> <span class="o">=</span> <span class="n">Model</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
<span class="n">inp</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">BATCH_SIZE</span><span class="p">,</span> <span class="n">IN_CH</span><span class="p">)</span>
<span class="n">path</span> <span class="o">=</span> <span class="s1">&#39;quant_model_3b_4b_qcdq.onnx&#39;</span>

<span class="n">exported_model</span> <span class="o">=</span> <span class="n">export_onnx_qcdq</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">inp</span><span class="p">,</span> <span class="n">export_path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="n">opset_version</span><span class="o">=</span><span class="mi">13</span><span class="p">)</span>

<span class="n">sess_opt</span> <span class="o">=</span> <span class="n">ort</span><span class="o">.</span><span class="n">SessionOptions</span><span class="p">()</span>
<span class="n">sess</span> <span class="o">=</span> <span class="n">ort</span><span class="o">.</span><span class="n">InferenceSession</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">sess_opt</span><span class="p">)</span>
<span class="n">input_name</span> <span class="o">=</span> <span class="n">sess</span><span class="o">.</span><span class="n">get_inputs</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">name</span>
<span class="n">pred_onx</span> <span class="o">=</span> <span class="n">sess</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="p">{</span><span class="n">input_name</span><span class="p">:</span> <span class="n">inp</span><span class="o">.</span><span class="n">numpy</span><span class="p">()})[</span><span class="mi">0</span><span class="p">]</span>


<span class="n">out_brevitas</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
<span class="n">out_ort</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span><span class="n">pred_onx</span><span class="p">)</span>

<span class="k">assert</span> <span class="n">torch</span><span class="o">.</span><span class="n">allclose</span><span class="p">(</span><span class="n">out_brevitas</span><span class="p">,</span> <span class="n">out_ort</span><span class="p">)</span>
</pre></div>
</div>
</div>
<section id="QGEMM-vs-GEMM">
<h4>QGEMM vs GEMM<a class="headerlink" href="#QGEMM-vs-GEMM" title="Permalink to this heading">#</a></h4>
<p>QCDQ makes it possible to execute low-precision fake-quantization in ONNX Runtime, meaning operations actually happen among floating-point values. ONNX Runtime is also capable of optimizing and accelerating a QCDQ model by leveraging int8-based QGEMM kernels in some scenarios.</p>
<p>This seems to happen only when using a <code class="docutils literal notranslate"><span class="pre">QuantLinear</span></code> layer, with the following requirements: - Input, Weight, Bias, and Output tensors must be quantized; - Bias tensor must be present, and quantized with bitwidth &gt; 8; - The output of the QuantLinear must be re-quantized; - The output bit-width must be equal to 8; - The input bit-width must be equal to 8; - The weights bit-width can be &lt;= 8; - The weights can be quantized per-tensor or per-channel.</p>
<p>We did not observe a similar behavior for other operations such as <code class="docutils literal notranslate"><span class="pre">QuantConvNd</span></code>.</p>
<p>An example of a layer that will match this definition is the following:</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">brevitas.quant.scaled_int</span><span class="w"> </span><span class="kn">import</span> <span class="n">Int32Bias</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">brevitas.quant.scaled_int</span><span class="w"> </span><span class="kn">import</span> <span class="n">Int8ActPerTensorFloat</span>

<span class="n">qgemm_ort</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantLinear</span><span class="p">(</span>
    <span class="n">IN_CH</span><span class="p">,</span> <span class="n">OUT_CH</span><span class="p">,</span>
    <span class="n">weight_bit_width</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
    <span class="n">input_quant</span><span class="o">=</span><span class="n">Int8ActPerTensorFloat</span><span class="p">,</span>
    <span class="n">output_quant</span><span class="o">=</span><span class="n">Int8ActPerTensorFloat</span><span class="p">,</span>
    <span class="n">bias</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">bias_quant</span><span class="o">=</span><span class="n">Int32Bias</span><span class="p">)</span>
</pre></div>
</div>
</div>
<p>Unfortunately, ONNX Runtime does not provide a built-in way to log whether execution goes through unoptimized floating-point GEMM or int8 QGEMM.</p>
</section>
</section>
<section id="QOps">
<h3>QOps<a class="headerlink" href="#QOps" title="Permalink to this heading">#</a></h3>
<p>As in the QCDQ case, we are using only standard ONNX operations, so we can use ONNX Runtime to execute our exported models.</p>
<div class="nbinput docutils container">
<div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]:
</pre></div>
</div>
<div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">onnxruntime</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">ort</span>

<span class="k">class</span><span class="w"> </span><span class="nc">Model</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">input_quant</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantIdentity</span><span class="p">(</span><span class="n">return_quant_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">conv</span> <span class="o">=</span> <span class="n">qnn</span><span class="o">.</span><span class="n">QuantConv2d</span><span class="p">(</span>
            <span class="n">IN_CH</span><span class="p">,</span> <span class="n">OUT_CH</span><span class="p">,</span> <span class="n">kernel_size</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">bias</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
            <span class="n">weight_bit_width</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
            <span class="n">output_quant</span><span class="o">=</span><span class="n">Int8ActPerTensorFloat</span><span class="p">,</span>
            <span class="n">output_bit_width</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
            <span class="n">return_quant_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inp</span><span class="p">):</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_quant</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="n">inp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">conv</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="k">return</span> <span class="n">inp</span>

<span class="n">model</span> <span class="o">=</span> <span class="n">Model</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
<span class="n">inp</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">BATCH_SIZE</span><span class="p">,</span> <span class="n">IN_CH</span><span class="p">,</span> <span class="n">IMG_SIZE</span><span class="p">,</span> <span class="n">IMG_SIZE</span><span class="p">)</span>
<span class="n">path</span> <span class="o">=</span> <span class="s1">&#39;quant_model_qops_4b_4b.onnx&#39;</span>

<span class="n">exported_model</span> <span class="o">=</span> <span class="n">export_onnx_qop</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">inp</span><span class="p">,</span> <span class="n">export_path</span><span class="o">=</span><span class="n">path</span><span class="p">)</span>

<span class="n">sess_opt</span> <span class="o">=</span> <span class="n">ort</span><span class="o">.</span><span class="n">SessionOptions</span><span class="p">()</span>
<span class="n">sess</span> <span class="o">=</span> <span class="n">ort</span><span class="o">.</span><span class="n">InferenceSession</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">sess_opt</span><span class="p">)</span>
<span class="n">input_name</span> <span class="o">=</span> <span class="n">sess</span><span class="o">.</span><span class="n">get_inputs</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">name</span>
<span class="n">pred_onx</span> <span class="o">=</span> <span class="n">sess</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="p">{</span><span class="n">input_name</span><span class="p">:</span> <span class="n">inp</span><span class="o">.</span><span class="n">numpy</span><span class="p">()})[</span><span class="mi">0</span><span class="p">]</span>


<span class="n">out_brevitas</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span><span class="o">.</span><span class="n">int</span><span class="p">()</span>
<span class="n">out_ort</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span><span class="n">pred_onx</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int8</span><span class="p">)</span>

<span class="k">assert</span> <span class="n">torch</span><span class="o">.</span><span class="n">allclose</span><span class="p">(</span><span class="n">out_brevitas</span><span class="p">,</span> <span class="n">out_ort</span><span class="p">,</span> <span class="n">atol</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="nboutput nblast docutils container">
<div class="prompt empty docutils container">
</div>
<div class="output_area stderr docutils container">
<div class="highlight"><pre>
c:\users\alessand\documents\brevitas\src\brevitas\export\onnx\standard\manager.py:23: UserWarning: ONNX opset version set to 13, override with opset_version=
  warnings.warn(f&#34;ONNX opset version set to {DEFAULT_OPSET}, override with {ka}=&#34;)
</pre></div></div>
</div>
<p>Note a few things. <code class="docutils literal notranslate"><span class="pre">QuantConv2d</span></code> defines <code class="docutils literal notranslate"><span class="pre">return_quant_tensor=True</span></code> so that the exported ONNX model doesn’t have a dequantize node at the end. Because we set <code class="docutils literal notranslate"><span class="pre">output_bit_width=4</span></code> (overriding the 8-bit bit-width in the <code class="docutils literal notranslate"><span class="pre">output_quant</span></code> quantizer), we have a <code class="docutils literal notranslate"><span class="pre">Clip</span></code> node at the end. At the same time, within Brevitas, <code class="docutils literal notranslate"><span class="pre">return_quant_tensor=True</span></code> means the PyTorch model returns a Brevitas <code class="docutils literal notranslate"><span class="pre">QuantTensor</span></code>, from which we are taking the <code class="docutils literal notranslate"><span class="pre">int</span></code> representation.</p>
<p>Due to differences in how the computation is performed between Brevitas and ONNX Runtime, it might happen that the two results are slightly different (since Brevitas uses a style closer to QCDQ, rather than operating between integers), thus we added a tolerance for off-by-1 errors.</p>
</section>
</section>
</section>


                </article>
              
              
              
              
              
                <footer class="prev-next-footer d-print-none">
                  
<div class="prev-next-area">
    <a class="left-prev"
       href="quant_recurrent.html"
       title="previous page">
      <i class="fa-solid fa-angle-left"></i>
      <div class="prev-next-info">
        <p class="prev-next-subtitle">previous</p>
        <p class="prev-next-title">Quantized RNNs and LSTMs</p>
      </div>
    </a>
    <a class="right-next"
       href="../settings.html"
       title="next page">
      <div class="prev-next-info">
        <p class="prev-next-subtitle">next</p>
        <p class="prev-next-title">Settings</p>
      </div>
      <i class="fa-solid fa-angle-right"></i>
    </a>
</div>
                </footer>
              
            </div>
            
            
              
                <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">


  <div class="sidebar-secondary-item">
<div
    id="pst-page-navigation-heading-2"
    class="page-toc tocsection onthispage">
    <i class="fa-solid fa-list"></i> On this page
  </div>
  <nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
    <ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Requirements">Requirements</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Introduction">Introduction</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#QuantizeLinear-Clip-DeQuantizeLinear-(QCDQ)">QuantizeLinear-Clip-DeQuantizeLinear (QCDQ)</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Basic-Example">Basic Example</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Complete-Model">Complete Model</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#The-C-in-QCDQ-(Bitwidth-&lt;=-8)">The C in QCDQ (Bitwidth &lt;= 8)</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#QOps-Export">QOps Export</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Clipping-in-QOps">Clipping in QOps</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ONNX-Runtime">ONNX Runtime</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#QCDQ">QCDQ</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#QGEMM-vs-GEMM">QGEMM vs GEMM</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#QOps">QOps</a></li>
</ul>
</li>
</ul>
  </nav></div>

  <div class="sidebar-secondary-item">

  <div class="tocsection sourcelink">
    <a href="../_sources/tutorials/onnx_export.nblink.txt">
      <i class="fa-solid fa-file-lines"></i> Show Source
    </a>
  </div>
</div>

</div></div>
              
            
          </div>
          <footer class="bd-footer-content">
            
          </footer>
        
      </main>
    </div>
  </div>
  
  <!-- Scripts loaded after <body> so the DOM is not blocked -->
  <script src="../_static/scripts/bootstrap.js?digest=3ee479438cf8b5e0d341"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=3ee479438cf8b5e0d341"></script>

  <footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
  
    <div class="footer-items__start">
      
        <div class="footer-item">

  <p class="copyright">
    
      © Copyright 2025 - Advanced Micro Devices, Inc..
      <br/>
    
  </p>
</div>
      
        <div class="footer-item">

  <p class="sphinx-version">
    Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 5.3.0.
    <br/>
  </p>
</div>
      
    </div>
  
  
  
    <div class="footer-items__end">
      
        <div class="footer-item">
<script>
document.write(`
  <div class="version-switcher__container dropdown">
    <button id="pst-version-switcher-button-2"
      type="button"
      class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle"
      data-bs-toggle="dropdown"
      aria-haspopup="listbox"
      aria-controls="pst-version-switcher-list-2"
      aria-label="Version switcher list"
    >
      Choose version  <!-- this text may get changed later by javascript -->
      <span class="caret"></span>
    </button>
    <div id="pst-version-switcher-list-2"
      class="version-switcher__menu dropdown-menu list-group-flush py-0"
      role="listbox" aria-labelledby="pst-version-switcher-button-2">
      <!-- dropdown will be populated by javascript on page load -->
    </div>
  </div>
`);
</script></div>
      
    </div>
  
</div>

  </footer>
  </body>
</html>